Preprocessing

# Load library:
library(plyr)
library(SummarizedExperiment)
library(readxl)
library(dplyr)
library(ggplot2)
library(pheatmap)
library("RColorBrewer")
library(DESeq2)
library(knitr)
library(kableExtra)

set.seed("12345")

# Load data:
se <- readRDS("/restricted/projectnb/pulmseq/Novartis/processing/work/28/da9dcb4ebb5f902c53fff1bc1ebdc2/Novartis_Gene_Expression.rds")
colnames(se) <- gsub(pattern = "_.*", "", colnames(se))

# Load annotation:

# Matches sample ID to kitnumber + Has batch info
annotManifest <- read_excel("/restricted/projectnb/pulmseq/Novartis/annotation/Novartis_RNASeq_run_manifest.xlsx")

# Matches sample ID to sample type/RIN (received from Jack, 8/24/20)
annotSampleEM0 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "EM0", skip = 2)

annotSampleEM5 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "EM5", skip = 2)

annotSampleEM6 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "EM6", skip = 2)

annotSampleEM8 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "EM8", skip = 3)

annotSampleEM15 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "EM15", skip = 3)

annotSampleEM16 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "EM16", skip = 3)

annotSampleN1 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "N1", skip = 3)

annotSampleN2 <- read_excel("/restricted/projectnb/camplab/home/ykoga07/DECAMP/Novartis/Data/Annotation/RNA_data_for_Yusuke_08_24_20.xlsx", sheet = "N2", skip = 3)

# Combine Jack's sample-based annotation file into one dataframe
annotSampleEM0 <- annotSampleEM0[, c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Original Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleEM0) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

annotSampleEM5 <- annotSampleEM5[, c(
  "RNA Sample ID", "DECAMP #",
  "Patient ID", "Patient ID", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleEM5) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

annotSampleEM6 <- annotSampleEM6[, c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleEM6) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

annotSampleEM8 <- annotSampleEM8[, c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Original Sample ID", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleEM8) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

annotSampleEM15 <- annotSampleEM15[, c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleEM15) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

annotSampleEM16 <- annotSampleEM16[, c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleEM16) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

annotSampleN1 <- annotSampleN1[, c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleN1) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

annotSampleN2 <- annotSampleN2[, c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)]
colnames(annotSampleN2) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)

allAnnotSamples <- ls()[grep("annotSample", ls())]
annotSample <- as.data.frame(matrix(ncol = 8))
colnames(annotSample) <- c(
  "RNA Sample ID", "DECAMP #",
  "Sample ID", "Kit Number", "Sample Type",
  "Isolation Date", "RIN", "DV200 (%)"
)
# Combine
for (x in allAnnotSamples) {
  annotSamples <- get(x)
  annotSamples$`Isolation Date` <- as.character(annotSamples$`Isolation Date`)
  annotSample <- rbind(annotSample, annotSamples)
}
annotSample <- annotSample[-which(is.na(annotSample$`RNA Sample ID`)), ]
annotSample <- annotSample[-which(is.na(annotSample$`Sample Type`)), ]

annotSample <- annotSample[-which(duplicated(annotSample$`RNA Sample ID`)), ]

# Save which samples had wrong rRNA peak when checking RIN
wrongRRNA.ix <- grep(pattern = "rong rRNA peak", annotSample$RIN)
annotSample$wrongRNAPeak <- 0
annotSample$wrongRNAPeak[wrongRRNA.ix] <- 1

# Edit RIN section to convert into numeric vector
annotSample$RIN <- gsub(pattern = "(wrong rRNA peak)", replacement = "", annotSample$RIN, fixed = TRUE)
annotSample$RIN <- gsub(pattern = "(wrong RNA peak)", replacement = "", annotSample$RIN, fixed = TRUE)
annotSample$RIN <- gsub(pattern = "(wrong rRNA peaks)", replacement = "", annotSample$RIN, fixed = TRUE)
annotSample$RIN <- gsub(pattern = "(Wrong rRNA peak)", replacement = "", annotSample$RIN, fixed = TRUE)
annotSample$RIN <- gsub(pattern = "(Wrong rRNA peaks)", replacement = "", annotSample$RIN, fixed = TRUE)
annotSample$RIN <- gsub(pattern = "\\/NA", replacement = "", annotSample$RIN)
annotSample$RIN <- gsub(pattern = ".*\\/", replacement = "", annotSample$RIN)
annotSample$RIN <- as.numeric(annotSample$RIN)
# There is a "29", probably 2.9, will ask Jack
annotSample$RIN[annotSample$RIN == 29] <- 2.9

# Consolidate nasal/bronch terms
annotSample$`Sample Type` <- gsub(pattern = ".*Bronch.*", replacement = "Bronchial Brushings", annotSample$`Sample Type`)
annotSample$`Sample Type` <- gsub(pattern = ".*Nasal.*", replacement = "Nasal Brushings", annotSample$`Sample Type`)

annotManifest <- filter(annotManifest, Run == "Run 1")
annotSample <- annotSample[annotSample$`RNA Sample ID` %in% annotManifest$Sample, ]
annotSample <- annotSample[match(annotSample$`RNA Sample ID`, annotManifest$Sample), ]

# Removing duplicate column
annotManifest$`Kit Number` <- NULL

annot <- cbind(annotManifest, annotSample)

# Clinical info:

# Matches kitnumber to randID, "Novartis_datadump"
annotDataDump2 <- read_excel("/rprojectnb/decamp/annotation/20200920/FullDECAMP2_DataDump_20200920.xlsx", sheet = "_D2CASEMAP", )

# randID to demographics info, "Novartis_datadump"
annotDataDump3 <- read_excel("/rprojectnb/decamp/annotation/20200920/FullDECAMP2_DataDump_20200920.xlsx", sheet = "_D2DEMO")

# randID to age/smoking info, "Novartis_datadump"
annotDataDump4 <- read_excel("/rprojectnb/decamp/annotation/20200920/FullDECAMP2_DataDump_20200920.xlsx", sheet = "_D2ELIGCHECK")

# randID to FEV/FVC, "Novartis_datadump"
annotDataDump5 <- read_excel("/rprojectnb/decamp/annotation/20200920/FullDECAMP2_DataDump_20200920.xlsx", sheet = "_D2PFT")
annotDataDump5 <- filter(annotDataDump5, timepoint == "Baseline Visit")

# Combine annotation info
annotA <- merge(merge(annotDataDump2, annotDataDump3, by.x = "randID", by.y = "randID"), annotDataDump4, by.x = "randID", by.y = "randID")
annotA <- merge(annotA, annotDataDump5, by.x = "randID", by.y = "randID")

# Removing patient "4796-053" as "Pacific Islander" for time being
annotA <- annotA[!(duplicated(annotA$randID)), ]

annot <- merge(annot, annotA, by.x = "Kit Number", by.y = "kitnumber", all.x = TRUE)

annotBronch <- annot[annot$`Sample Type` == "Bronchial Brushings", ]
annotNasal <- annot[annot$`Sample Type` == "Nasal Brushings", ]

uniquePatients <- length(unique(annot$`Kit Number`))

numBronch <- length(unique(annotBronch$Sample))
numNasal <- length(unique(annotNasal$Sample))

bothBronchNasal <- length(intersect(annotBronch$`Kit Number`, annotNasal$`Kit Number`))

There are 280 samples and 201 unique patients in the dataset. 200 are bronchial samples, and 80 are nasal samples. There are 79 patients that have both bronchial and nasal samples.

# Samples flagged by Novartis as poor quality:
flaggedByNovartis <- c(
  "EM15-14", "EM15-31", "EM15-64", "EM15-85", "EM16-194", "EM16-195",
  "EM16-228", "EM5-100", "EM8-112", "EM8-130", "EM8-158", "EM8-171",
  "EM8-26", "EM8-55", "EM8-66", "N1-115", "N1-119", "N1-125",
  "N1-139", "N1-25", "N1-40", "N1-55", "N1-63", "N1-64",
  "N1-67", "N1-81", "N1-83", "N1-84", "N1-94", "N2-10",
  "N2-15", "N2-18", "N2-30", "N2-31", "N2-22", "N2-28"
)
seBronch <- se[, which(colnames(se) %in% annotBronch$`RNA Sample ID`)]

annotBronch <- annotBronch[annotBronch$`RNA Sample ID` %in% colnames(se), ]

# reorder
annotBronch <- annotBronch[match(colnames(seBronch), annotBronch$`RNA Sample ID`), ]

coldataBronch <- colData(seBronch)

Violin Plots, QC Metrics

RIN

dots <- T
boxplot <- F
violin <- T

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = annotBronch$RIN
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("RIN") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "Bronchial Brushings") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

p

Genes detected

dots <- T
boxplot <- T
violin <- T

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = colSums(assay(seBronch) > 0)
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("# Genes detected") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "Number of genes detected") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Percent GC

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = unlist(lapply(coldataBronch$FastQC_percent_GC, FUN = mean))
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Mean GC content (%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "FastQC, GC content (%)") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Percentage aligned to exons

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = (coldataBronch$RSeQC_cds_exons_tag_pct +
    coldataBronch$RSeQC_3_utr_exons_tag_pct +
    coldataBronch$RSeQC_5_utr_exons_tag_pct)
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Aligned to exonic region (%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "RSeQC, Percent aligned to exons") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Percentage aligned to introns

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$RSeQC_introns_tag_pct
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Aligned to intronic region (%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "RSeQC, Percent aligned to introns") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Percentage aligned to intergenic regions

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$RSeQC_tes_down_10kb_tag_pct +
    coldataBronch$RSeQC_tss_up_10kb_tag_pct
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Aligned to intergenic region (%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "RSeQC, Percent aligned to intergenic region") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Includes regions upstream of TSS (<10kb) and downstream of TES (<10kb)

Strandedness - Sense

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$RSeQC_pe_sense * 100
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Sense-stranded (%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "RSeQC, Percent Sense-strandedness") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Strandedness - Antisense

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$RSeQC_pe_antisense * 100
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Antisense-stranded (%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "RSeQC, Percent Antisense-strandedness") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Proper Pair Percentage

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$RSeQC_proper_pairs_percent
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("TIN") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "RSeQC, Proper Pairs (%)") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))
summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)
p

TIN

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$RSeQC_TIN_mean
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("TIN") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "RSeQC, TIN") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Percentage reads uniquely mapped

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$STAR_uniquely_mapped_percent
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Uniquely mapped to genome(%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "STAR, Unique mapped") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))
summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)

p

Percentage reads unmapped

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$STAR_unmapped_mismatches_percent +
    coldataBronch$STAR_unmapped_tooshort_percent +
    coldataBronch$STAR_unmapped_other_percent
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Unmapped to genome(%)") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "STAR, Percent unmapped") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))
summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)
p

Total reads

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$STAR_total_reads
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Total Number of Reads") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "STAR, Total reads") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))
summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)
p

Mismatch rate

df <- data.frame(
  x = annotBronch$`Sample Type`,
  y = coldataBronch$STAR_mismatch_rate
)

p <- ggplot2::ggplot(df) +
  ggplot2::aes_string(
    x = "x",
    y = "y"
  )
if (dots == TRUE) {
  p <- p + ggplot2::geom_jitter(
    color = "blue",
    width = 0.2,
    height = 0,
    size = 1,
    alpha = 1
  )
}
if (boxplot == TRUE) {
  p <- p + ggplot2::geom_boxplot(
    width = 0.5,
    alpha = 0
  )
}
if (violin == TRUE) {
  p <- p + ggplot2::geom_violin(
    trim = TRUE,
    scale = "width",
    size = 1,
    fill = "grey",
    alpha = 0.75
  )
}

p <- p + ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 10),
    axis.title = ggplot2::element_text(size = 10)
  ) + ggplot2::theme(axis.text.x = ggplot2::element_text(size = 15)) +
  ggplot2::ylab("Mismatch rate") +
  ggplot2::theme(
    axis.title.y = element_text(size = 15),
    axis.title.x = ggplot2::element_blank()
  )

p <- p + ggplot2::ggtitle(label = "STAR, mismatch rate") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))
summary <- "median"
summ <- df %>%
  dplyr::group_by(x) %>%
  dplyr::summarize(value = stats::median(y))
fun <- stats::median

summ$statY <- max(df$y) + (max(df$y) - min(df$y)) * 0.1
summary <- paste(toupper(substr(summary, 1, 1)),
  substr(summary, 2, nchar(summary)),
  sep = ""
)
summ$label <- paste0(summary, ": ", round(summ$value, 5))

p <- p + ggplot2::geom_text(
  data = summ,
  ggplot2::aes_string(
    x = "x",
    y = "statY",
    label = "label"
  ),
  size = 5
)
p <- p + ggplot2::stat_summary(
  fun = fun, fun.min = fun,
  fun.max = fun,
  geom = "crossbar",
  color = "red",
  linetype = "dashed"
)
p

Histogram, QC outliers

A tally of how many times a sample had an outlier for a numeric QC output was taken. An outlier was considered to be a value past 2 standard deviations of the mean. The barplot below is colored by whether the sample RIN was higher or lower than 5, to determine if RIN scores had a correlation to the QC results.

fastqc.ix <- grep("FastQC", colnames(coldataBronch))
rseqc.ix <- grep("RSeQC", colnames(coldataBronch))
star.ix <- grep("STAR", colnames(coldataBronch))

colDataBronchSubset <- coldataBronch[, c(fastqc.ix, rseqc.ix, star.ix)]

# Bronch samples flagged by Novartis
flaggedByNovartisBronch <- flaggedByNovartis[flaggedByNovartis %in% rownames(colDataBronchSubset)]
novartisQC <- sapply(rownames(colDataBronchSubset), function(x) {
  if (x %in% flaggedByNovartisBronch) {
    return("Poor")
  } else {
    return("Good")
  }
})

# FastQC outputs lists instead of numeric/integer, because it is looking
# at pairs of fastqs.
classlist <- c()
for (x in 1:ncol(colDataBronchSubset)) {
  classlist <- c(classlist, class(colDataBronchSubset[1, x]))
}

# Of FastQC outputs, only taking out average seq length, total sequences,
# dedup percentage, percentGC. Other metrics seem to be passing for all data points
colDataBronchSubsetFastq <- colDataBronchSubset[, c(4, 12, 16, 17)]
colDataBronchSubsetFastq2 <- apply(colDataBronchSubsetFastq, 1:2, function(x) {
  return(mean(unlist(x)))
})
colDataBronchSubset <- colDataBronchSubset[, -which(classlist == "list")]
colDataBronchSubset <- cbind(colDataBronchSubset, colDataBronchSubsetFastq2)

# Also removing all QC metrics which have the same values across all samples
rm.ix.qc <- c()
for (x in 1:ncol(colDataBronchSubset)) {
  if (length(unique(colDataBronchSubset[, x])) == 1) {
    rm.ix.qc <- c(rm.ix.qc, x)
  }
}

colDataBronchSubset <- colDataBronchSubset[, -rm.ix.qc]

qc.tally <- matrix(ncol = nrow(colDataBronchSubset), data = 0)
colnames(qc.tally) <- rownames(colDataBronchSubset)

qc.tally2 <- matrix(
  ncol = nrow(colDataBronchSubset),
  nrow = ncol(colDataBronchSubset),
  data = 0
)

rownames(qc.tally2) <- colnames(colDataBronchSubset)
colnames(qc.tally2) <- rownames(colDataBronchSubset)

isNumeric <- 0
for (i in 1:ncol(colDataBronchSubset)) {
  qc <- colDataBronchSubset[, i]
  if (class(qc) == "numeric" | class(qc) == "integer") {
    isNumeric <- isNumeric + 1
    qc.filt <- qc[!is.na(qc)]
    mean.qc <- mean(qc.filt)
    sd.qc <- sd(qc.filt)
    qc.tally[, which(qc > mean.qc + 2 * sd.qc | qc < mean.qc - 2 * sd.qc)] <- qc.tally[, which(qc > mean.qc + 2 * sd.qc | qc < mean.qc - 2 * sd.qc)] + 1

    qc.tally2[i, which(qc > mean.qc + 2 * sd.qc | qc < mean.qc - 2 * sd.qc)] <- qc.tally2[i, which(qc > mean.qc + 2 * sd.qc | qc < mean.qc - 2 * sd.qc)] + 1
  }
}

qc.tally.gg <- as.data.frame(t(qc.tally))
qc.tally.gg$RIN <- annotBronch$RIN < 5
qc.tally.gg$RIN[qc.tally.gg$RIN == TRUE] <- "Lower than 5"
qc.tally.gg$RIN[qc.tally.gg$RIN == FALSE] <- "Higher than 5"

histQC <- ggplot(data = qc.tally.gg, aes(x = V1, color = RIN, fill = RIN)) +
  geom_histogram() +
  labs(x = "Tally of outliers") +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 15),
    axis.title = ggplot2::element_text(size = 15)
  )

histQC

# rowSums(qc.tally2[,colSums(qc.tally2) > 30])

qc.tally.gg$Number_Outliers_Binned <- qc.tally.gg$V1 > 25

qc.tally.gg$Number_Outliers_Binned <- as.character(qc.tally.gg$Number_Outliers_Binned)
qc.tally.gg$Number_Outliers_Binned <- gsub("FALSE", "x < 25",
                                           qc.tally.gg$Number_Outliers_Binned)
qc.tally.gg$Number_Outliers_Binned <- gsub("TRUE", "x => 25",
                                           qc.tally.gg$Number_Outliers_Binned)

highAmountOutlier <- colnames(qc.tally)[which(qc.tally > 25)]

qualAnnot <- sapply(qc.tally, function(x) {
  if (x < 25) {
    return("x =< 25")
  } else if (x < 40) {
    return("25 < x < 40")
  } else {
    return("x >= 40")
  }
})

Number_Outliers_Binned <- as.factor(qualAnnot)
Number_Outliers_Binned <- factor(Number_Outliers_Binned, levels = c("x =< 25",
                                    "25 < x < 40",
                                    "x >= 40"))

A total of 96 QC metrics were taken into consideration. The following samples had a high number of QC metrics outliers: EM5-100, EM8-112, EM8-130, EM8-26, EM8-66, N1-115, N1-25, N1-55, N1-63, N1-67, N1-81, N1-83, N1-84, N1-94, N2-15, N2-31

The barplots were also annotated by the QC distinctions Novartis had previously made. (Poor vs Good)

qc.tally.gg$NovartisQC <- novartisQC
histQC2 <- ggplot(data = qc.tally.gg, aes(x = V1, color = NovartisQC, fill = NovartisQC)) +
  geom_histogram() +
  labs(x = "Tally of outliers") +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 15),
    axis.title = ggplot2::element_text(size = 15)
  ) +
  scale_fill_manual(values = c("Red", "Blue"))

histQC2

Using Proper pairs percent

Uniquely mapped vs Proper Pairs Percent

ggplot(as.data.frame(coldataBronch), aes(x = STAR_uniquely_mapped, y = RSeQC_proper_pairs_percent, color = Number_Outliers_Binned)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  ) +
  ggplot2::xlab(paste0("STAR Uniquely mapped")) +
  ggplot2::ylab(paste0("RSeQC Proper Pairs (%)")) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2)))

Proper Pairs Percent vs TIN median

ggplot(as.data.frame(coldataBronch), aes(x = RSeQC_proper_pairs_percent, y = RSeQC_TIN_median, color = Number_Outliers_Binned)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  ) +
  ggplot2::xlab(paste0("RSeQC Proper Pairs (%)")) +
  ggplot2::ylab(paste0("RSeQC TIN median")) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2)))

PCA, QC outliers

We used 96 QC metrics and computed the PCA on z-scored values. We subsequently ran Pearson correlation test on PC 1-10 and the z-score of each QC metric to calculate which principal component each QC metric was most strongly associated to.

Outliers

qcMetrics <- colDataBronchSubset

qcMetrics.z <- t(scale(qcMetrics))
qcMetrics.z.trim <- qcMetrics.z

qcMetrics.z.trim[qcMetrics.z < -2] <- -2
qcMetrics.z.trim[qcMetrics.z > 2] <- 2


RIN <- annotBronch$RIN
Number_Outliers_Binned <- as.factor(qualAnnot)
Number_Outliers_Binned <- factor(Number_Outliers_Binned, levels = c("x =< 25",
                                    "25 < x < 40",
                                    "x >= 40"))

Novartis_QC <- novartisQC

z <- qcMetrics.z

# run PCA
pc <- prcomp(z, center = F, scale = F)

forpcaBronch <- as.data.frame(pc$rotation[, 1:3])

forpcaBronch$Number_Outliers_Binned <- Number_Outliers_Binned

spc <- summary(pc)
pc1Importance <- signif(spc$importance[2, 1] * 100, 3)
pc2Importance <- signif(spc$importance[2, 2] * 100, 3)

# pdf("~/PCA_Batch.pdf", width = 9)
ggplot(forpcaBronch, aes(x = PC1, y = PC2, color = Number_Outliers_Binned)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  ) +
  ggplot2::xlab(paste0("PC1 (", pc1Importance, "%)")) +
  ggplot2::ylab(paste0("PC2 (", pc2Importance, "%)")) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2)))

cor1 <- cor(pc$rotation[,1:10], t(qcMetrics.z), method = "pearson")

cor1Res <- apply(cor1, 2, function(x){
    return(names(which.max(abs(x))))
})

uniquePCs <- unique(cor1Res)[order(unique(cor1Res))]
qcpc.list = list()
for(x in 1:length(uniquePCs)){
  pc <- uniquePCs[x]
  qcpc.list[[pc]] = names(cor1Res)[cor1Res == pc]
}

QCPC.table <- plyr::ldply(qcpc.list, cbind)
colnames(QCPC.table) <- c("PC", "Metric")

QCPC.table %>%
  knitr::kable(
    format = "html", align = rep("c", ncol(QCPC.table) + 1),
    row.names = TRUE
  ) %>%
  kableExtra::kable_styling() %>%
  kableExtra::scroll_box(height = "250px")
PC Metric
1 PC1 RSeQC_TIN_mean
2 PC1 RSeQC_TIN_median
3 PC1 RSeQC_splice_reads
4 PC1 RSeQC_read_2
5 PC1 RSeQC_read_1
6 PC1 RSeQC_mapq_gte_mapq_cut_unique
7 PC1 RSeQC_reads_mapped_in_proper_pairs
8 PC1 RSeQC_mapq_lt_mapq_cut_non_unique
9 PC1 RSeQC_non_splice_reads
10 PC1 RSeQC_proper_pairs_percent
11 PC1 RSeQC_reads_map_to_antisense
12 PC1 RSeQC_total_records
13 PC1 RSeQC_non_primary_hits
14 PC1 RSeQC_unique_percent
15 PC1 RSeQC_reads_map_to_sense
16 PC1 RSeQC_total_splicing_events
17 PC1 RSeQC_novel_splicing_junctions
18 PC1 RSeQC_partial_novel_splicing_events_pct
19 PC1 RSeQC_partial_novel_splicing_junctions
20 PC1 RSeQC_known_splicing_junctions
21 PC1 RSeQC_novel_splicing_events_pct
22 PC1 RSeQC_total_splicing_junctions
23 PC1 RSeQC_partial_novel_splicing_junctions_pct
24 PC1 RSeQC_known_splicing_junctions_pct
25 PC1 RSeQC_novel_splicing_events
26 PC1 RSeQC_known_splicing_events
27 PC1 RSeQC_partial_novel_splicing_events
28 PC1 RSeQC_known_splicing_events_pct
29 PC1 RSeQC_tes_down_1kb_tags_kb
30 PC1 RSeQC_cds_exons_tag_count
31 PC1 RSeQC_5_utr_exons_tags_kb
32 PC1 RSeQC_tss_up_5kb_tag_count
33 PC1 RSeQC_tss_up_1kb_tag_pct
34 PC1 RSeQC_3_utr_exons_tags_kb
35 PC1 RSeQC_3_utr_exons_tag_pct
36 PC1 RSeQC_5_utr_exons_tag_count
37 PC1 RSeQC_tes_down_1kb_tag_count
38 PC1 RSeQC_total_assigned_tags
39 PC1 RSeQC_3_utr_exons_tag_count
40 PC1 RSeQC_tss_up_1kb_tag_count
41 PC1 RSeQC_other_intergenic_tag_count
42 PC1 RSeQC_introns_tag_count
43 PC1 RSeQC_cds_exons_tag_pct
44 PC1 RSeQC_tss_up_10kb_tag_pct
45 PC1 RSeQC_tss_up_1kb_tags_kb
46 PC1 RSeQC_5_utr_exons_tag_pct
47 PC1 RSeQC_tss_up_5kb_tag_pct
48 PC1 RSeQC_cds_exons_tags_kb
49 PC1 RSeQC_other_intergenic_tag_pct
50 PC1 RSeQC_tss_up_10kb_tags_kb
51 PC1 RSeQC_introns_tags_kb
52 PC1 RSeQC_tes_down_1kb_tag_pct
53 PC1 RSeQC_tss_up_10kb_tag_count
54 PC1 RSeQC_introns_tag_pct
55 PC1 RSeQC_tss_up_5kb_tags_kb
56 PC1 STAR_uniquely_mapped_percent
57 PC1 STAR_num_splices
58 PC1 STAR_num_GCAG_splices
59 PC1 STAR_deletion_length
60 PC1 STAR_avg_mapped_read_length
61 PC1 STAR_mismatch_rate
62 PC1 STAR_avg_input_read_length
63 PC1 STAR_num_ATAC_splices
64 PC1 STAR_num_annotated_splices
65 PC1 STAR_num_GTAG_splices
66 PC1 STAR_uniquely_mapped
67 PC1 STAR_multimapped_toomany
68 PC1 STAR_unmapped_other
69 PC1 STAR_insertion_rate
70 PC1 STAR_unmapped_other_percent
71 PC1 STAR_multimapped_percent
72 PC1 STAR_multimapped
73 PC1 STAR_num_noncanonical_splices
74 PC1 STAR_multimapped_toomany_percent
75 PC1 FastQC_avg_sequence_length
76 PC1 FastQC_percent_GC
77 PC1 FastQC_total_deduplicated_percentage
78 PC2 RSeQC_novel_splicing_junctions_pct
79 PC2 RSeQC_tes_down_5kb_tag_pct
80 PC2 RSeQC_tes_down_5kb_tags_kb
81 PC2 RSeQC_tes_down_10kb_tag_pct
82 PC2 RSeQC_tes_down_5kb_tag_count
83 PC2 RSeQC_tes_down_10kb_tag_count
84 PC2 RSeQC_tes_down_10kb_tags_kb
85 PC3 RSeQC_total_reads
86 PC3 RSeQC_total_tags
87 PC3 STAR_unmapped_tooshort_percent
88 PC3 STAR_total_reads
89 PC3 FastQC_Total_Sequences
90 PC4 RSeQC_unmapped_reads
91 PC4 STAR_unmapped_tooshort
92 PC5 RSeQC_failed
93 PC5 RSeQC_pe_sense
94 PC5 RSeQC_pe_antisense
95 PC7 RSeQC_TIN_stdev
96 PC7 STAR_insertion_length

Of all of the QC metrics, 77 were most associated, either positively or negatively, to PC1.

Outliers, Novartis

forpcaBronch$Quality <- Novartis_QC

# pdf("~/PCA_Batch.pdf", width = 9)
ggplot(forpcaBronch, aes(x = PC1, y = PC2, color = Quality)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  ) +
  ggplot2::xlab(paste0("PC1 (", pc1Importance, "%)")) +
  ggplot2::ylab(paste0("PC2 (", pc2Importance, "%)")) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2)))

Heatmap, QC outliers

The numeric QC metric values were z-scored across samples, plotted by heatmap. A total of 96 QC metrics were used.

RIN <- annotBronch$RIN
Number_Outliers_Binned <- as.factor(qualAnnot)
Number_Outliers_Binned <- factor(Number_Outliers_Binned, levels = c("x =< 25",
                                    "25 < x < 40",
                                    "x >= 40"))

Novartis_QC <- novartisQC
annotation_col <- data.frame(RIN, Number_Outliers_Binned, Novartis_QC)

rownames(annotation_col) <- colnames(qcMetrics.z.trim)

annotation_row <- data.frame(Correlated_PC = cor1Res)
rownames(annotation_row) <- rownames(qcMetrics.z.trim)

callback <- function(hc, mat) {
  sv <- svd(t(mat))$v[, 1]
  dend <- reorder(as.dendrogram(hc), wts = sv)
  as.hclust(dend)
}

out <- pheatmap(qcMetrics.z.trim,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
  annotation_col = annotation_col,
  annotation_row = annotation_row,
  legend = T,
  show_rownames = F,
  show_colnames = F,
  fontsize_row = 9,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  clustering_callback = callback
)

out

##Added 10/26/20. Used cutree to determine which samples to remove

clusterLabels <- sort(cutree(out$tree_col, k=2))
write.table(x = names(clusterLabels[clusterLabels == 2]),
        file = "20201026_PoorQuality_Bronch.txt", sep = "\t")

removed <- names(clusterLabels[clusterLabels == 2])

We split the dendrogram into 2 “branches” and considered all samples within the branch enriched for poor QC metrics to be of poor quality. These 27 samples are the following: EM5-100, EM8-112, EM8-130, EM8-158, EM8-171, EM8-174, EM8-194, EM8-26, EM8-55, EM8-60, EM8-66, N1-115, N1-16, N1-25, N1-40, N1-43, N1-55, N1-63, N1-64, N1-67, N1-81, N1-83, N1-84, N1-94, N2-10, N2-15, N2-31

PCA, Batch Effect

PCA was conducted on TPM counts. No evidence of batch effect can be seen in the PCA plot.

PC1 vs PC2

counts <- assays(seBronch)[[2]]

counts <- counts[rowSums(counts) > 0, ]

# z normalize
z <- t(scale(t(counts)))
# run PCA
pc <- prcomp(z, center = F, scale = F)

forpcaBronch <- as.data.frame(pc$rotation[, 1:3])

forpcaBronch$Batch <- annotBronch$Batch

spc <- summary(pc)
pc1Importance <- signif(spc$importance[2, 1] * 100, 3)
pc2Importance <- signif(spc$importance[2, 2] * 100, 3)
pc3Importance <- signif(spc$importance[2, 3] * 100, 3)

# pdf("~/PCA_Batch.pdf", width = 9)
ggplot(forpcaBronch, aes(x = PC1, y = PC2, color = Batch)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  )  +
  ggplot2::xlab(paste0("PC1 (", pc1Importance, "%)")) +
  ggplot2::ylab(paste0("PC2 (", pc2Importance, "%)")) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2)))

# dev.off()

PC1 vs PC3

# PCA: PC1 vs PC3
ggplot(forpcaBronch, aes(x = PC1, y = PC3, color = Batch)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  )  +
  ggplot2::xlab(paste0("PC1 (", pc1Importance, "%)")) +
  ggplot2::ylab(paste0("PC3 (", pc3Importance, "%)")) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2)))

QC outliers

forpcaBronch <- cbind(forpcaBronch, qc.tally.gg)
ggplot(forpcaBronch, aes(x = PC1, y = PC2, color = Number_Outliers_Binned)) +
  geom_point(size = 0.75) +
  geom_point(
    data = subset(forpcaBronch, Number_Outliers_Binned == "x => 25"),
    aes(x = PC1, y = PC2, color = Number_Outliers_Binned)
  ) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  )  +
  ggplot2::xlab(paste0("PC1 (", pc1Importance, "%)")) +
  ggplot2::ylab(paste0("PC2 (", pc2Importance, "%)")) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2)))

Sex status

sex.genes <- c("KDM5D", "UTY", "DDX3Y", "USP9Y", "TXLNG2P", "RPS4Y1", "XIST")
sex.genes.id <- match(sex.genes, rowRanges(seBronch)$external_gene_id)

Gender <- annotBronch$PERSON_GENDER
Sample_Type <- annotBronch$`Sample Type`
Gender <- gsub(" Gender", "", Gender)
annotation_col <- data.frame(Gender, Sample_Type)

counts <- assays(seBronch)$TPM
rownames(counts) <- rowRanges(seBronch)$external_gene_id

genes1 <- counts[sex.genes.id, ]

genes1.z <- t(scale(t(genes1)))
genes1.z.trim <- genes1.z
genes1.z.trim[genes1.z < -2] <- -2
genes1.z.trim[genes1.z > 2] <- 2

rownames(annotation_col) <- colnames(genes1.z.trim)

callback <- function(hc, mat) {
  sv <- svd(t(mat))$v[, 1]
  dend <- reorder(as.dendrogram(hc), wts = sv)
  as.hclust(dend)
}

out <- pheatmap(genes1.z.trim,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
  annotation_col = annotation_col,
  legend = T,
  show_rownames = T,
  show_colnames = F,
  fontsize_row = 9,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  clustering_callback = callback
)

exprs.male <- which(genes1.z.trim["KDM5D", ] > 1)
record.female <- which(annotBronch$PERSON_GENDER == "Female Gender")
sex.discrepancy <- annotBronch$`Kit Number`[intersect(exprs.male, record.female)]

out

The patient expressing male sex genes, but annotated female, is 4438-095

Demographics Comparison

General annotation data of patients were compared between those of which samples were removed due to outliers, and those that were retained.

seBronchQC <- seBronch[, !colnames(seBronch) %in% highAmountOutlier]
annotBronchQC <- annotBronch[!annotBronch$`RNA Sample ID` %in% highAmountOutlier, ]

seBronchQCOut <- seBronch[, colnames(seBronch) %in% highAmountOutlier]
annotBronchQCOut <- annotBronch[annotBronch$`RNA Sample ID` %in% highAmountOutlier, ]

lab <- c(
  rep("Retained", length(annotBronchQC$GRPB_SMK_STS)),
  rep("Removed", length(annotBronchQCOut$GRPB_SMK_STS))
)
# AGE
mean.age <- mean(annotBronchQC$AGE)
sd.age <- sd(annotBronchQC$AGE)

age <- paste0(signif(mean.age, 3), " +/- ", signif(sd.age, 3))

mean.age.rm <- mean(annotBronchQCOut$AGE)
sd.age.rm <- sd(annotBronchQCOut$AGE)

age.rm <- paste0(signif(mean.age.rm, 3), " +/- ", signif(sd.age.rm, 3))

age.pvalue <- signif(wilcox.test(annotBronchQC$AGE, annotBronchQCOut$AGE)$p.value, 3)

demo.table <- c(age, age.rm, age.pvalue)

# Gender
gstatus <- c(annotBronchQC$PERSON_GENDER, annotBronchQCOut$PERSON_GENDER)

male.ct <- sum(annotBronchQC$PERSON_GENDER == "Male Gender")
male.pct <- signif(male.ct / length(annotBronchQC$PERSON_GENDER), 3) * 100
male <- paste0("Male: ", male.ct, "(", male.pct, "%)")

male.ct.rm <- sum(annotBronchQCOut$PERSON_GENDER == "Male Gender")
male.pct.rm <- signif(male.ct.rm / length(annotBronchQCOut$PERSON_GENDER), 3) * 100
male.rm <- paste0("Male: ", male.ct.rm, "(", male.pct.rm, "%)")

gender.pvalue <- signif(chisq.test(table(lab, gstatus))$p.value, 3)

demo.table <- rbind(demo.table, c(male, male.rm, gender.pvalue))

# FEVFVC
mean.fevfvc <- mean(annotBronchQC$FEV_FVC)
sd.fevfvc <- sd(annotBronchQC$FEV_FVC)

fevfvc <- paste0(signif(mean.fevfvc, 3), " +/- ", signif(sd.fevfvc, 3))

mean.fevfvc.rm <- mean(annotBronchQCOut$FEV_FVC)
sd.fevfvc.rm <- sd(annotBronchQCOut$FEV_FVC)

fevfvc.rm <- paste0(signif(mean.fevfvc.rm, 3), " +/- ", signif(sd.fevfvc.rm, 3))

fevfvc.pvalue <- signif(wilcox.test(annotBronchQC$FEV_FVC, annotBronchQCOut$FEV_FVC)$p.value, 3)

demo.table <- rbind(demo.table, c(fevfvc, fevfvc.rm, fevfvc.pvalue))

# Smoking Status
smk <- c(annotBronchQC$GRPB_SMK_STS, annotBronchQCOut$GRPB_SMK_STS)

annotBronchQC$GRPB_SMK_STS <- gsub(" ", "_", annotBronchQC$GRPB_SMK_STS)
annotBronchQCOut$GRPB_SMK_STS <- gsub(" ", "_", annotBronchQCOut$GRPB_SMK_STS)

smkstatus.na <- sum(is.na(annotBronchQC$GRPB_SMK_STS == "Current_Smoker"))

smkstatus.ct <- sum(annotBronchQC[!is.na(annotBronchQC$GRPB_SMK_STS), ]$GRPB_SMK_STS == "Current_Smoker")
smkstatus.pct <- signif(smkstatus.ct / nrow(annotBronchQC[!is.na(annotBronchQC$GRPB_SMK_STS), ]), 3) * 100
smkstatus <- paste0("Current: ", smkstatus.ct, "(", smkstatus.pct, "%)")

smkstatus.ct.rm <- sum(annotBronchQCOut$GRPB_SMK_STS == "Current_Smoker")
smkstatus.pct.rm <- signif(smkstatus.ct.rm / length(annotBronchQCOut$GRPB_SMK_STS), 3) * 100
smkstatus.rm <- paste0("Current: ", smkstatus.ct.rm, "(", smkstatus.pct.rm, "%)")

smkstatus.pvalue <- signif(chisq.test(table(lab, smk))$p.value, 3)

demo.table <- rbind(demo.table, c(smkstatus, smkstatus.rm, smkstatus.pvalue))

rownames(demo.table) <- c("Age", "Gender", "FEV1/FVC", "Smoking Status")
colnames(demo.table) <- c("Retained", "Removed", "p-value")

demo.table %>%
  knitr::kable(
    format = "html", align = rep("c", ncol(demo.table) + 1),
    row.names = TRUE
  ) %>%
  kableExtra::kable_styling()
Retained Removed p-value
Age 63.5 +/- 6.1 61.1 +/- 6.27 0.239
Gender Male: 146(79.3%) Male: 14(87.5%) 0.648
FEV1/FVC 0.635 +/- 0.117 0.686 +/- 0.111 0.0535
Smoking Status Current: 66(36.3%) Current: 11(68.8%) 0.0221

Differential Expression, Smoking status

DESeq2 was run (Model: ~ Smoking Status) on both expression data prior to and after filtering possible outliers.

Pre-filter

counts <- assay(seBronch)
rownames(counts) <- rowRanges(seBronch)$external_gene_id

counts <- apply(counts, 1:2, function(x) {
  return(as.integer(x))
})

no.smk.status.ix <- which(is.na(annotBronch$GRPB_SMK_STS))

counts <- counts[, -no.smk.status.ix]
annotBronch <- annotBronch[-no.smk.status.ix, ]
annotBronch$GRPB_SMK_STS <- gsub(" ", "_", annotBronch$GRPB_SMK_STS)

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = annotBronch,
  design = ~GRPB_SMK_STS
)
dds$GRPB_SMK_STS <- relevel(dds$GRPB_SMK_STS, ref = "Former_Smoker")

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]
dds <- DESeq(dds)
res <- results(dds)

resOrdered <- res[order(res$pvalue), ]

resOrdered <- as.data.frame(resOrdered)
resOrdered$nLogPVal <- -log10(resOrdered$padj)
resSig <- resOrdered %>%
  filter(padj < 0.05) %>%
  filter(abs(log2FoldChange) > 0.5)
numSig <- nrow(resSig)
numUp <- nrow(resSig[resSig$log2FoldChange > 0.5, ])
numDown <- nrow(resSig[resSig$log2FoldChange < -0.5, ])

labels <- sapply(rownames(resOrdered), function(x) {
  if (x %in% c("CYP1A1", "CYP1B1")) {
    return("red")
  } else {
    return("black")
  }
})

labeltxt <- sapply(rownames(resOrdered), function(x) {
  if (x %in% c("CYP1A1")) {
    return("CYP1A1")
  } else if (x %in% c("CYP1B1")) {
    return("CYP1B1")
  } else {
    return("")
  }
})

resOrderedCYP <- resOrdered[c("CYP1A1", "CYP1B1"), ]

ggplot(resOrdered, aes(x = log2FoldChange, y = nLogPVal)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  ) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(
    colour =
      ggplot2::guide_legend(override.aes = list(size = 2))
  ) + geom_point(data = resSig, aes(x = log2FoldChange, y = nLogPVal), color = "Red",
                 size = 0.75) +
  geom_point(data = resOrderedCYP, aes(x = log2FoldChange, y = nLogPVal), color = "blue") +
  ggrepel::geom_text_repel(aes(x = log2FoldChange, y = nLogPVal), label = labeltxt) +
  ggplot2::labs(y = "-log10(Adjusted p-value)") +
  ggplot2::ggtitle(label = "Smoking status, Former vs Current") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

# genenameTable <- as.data.frame(as.matrix(rowRanges(seBronch)$external_gene_id))
# 
# genenameTable$ensembleGeneId <- rownames(assay(seBronch))
# 
# resOrderedPreFilter <- resOrdered[match(genenameTable$V1, rownames(resOrdered)),]
# rownames(resOrderedPreFilter) <- genenameTable$ensembleGeneId
# 
# rownames(limmaRes) <- gsub(pattern = "_at", "", rownames(limmaRes))
# 
# limmaRes <- limmaRes[match(genenameTable$ensembleGeneId, rownames(limmaRes)),]
# 
# genelistIntersect <- intersect(rownames(limmaRes), rownames(resOrderedPreFilter))
# 
# limmaRes <- limmaRes[genelistIntersect,]
# resOrderedPreFilter <- resOrderedPreFilter[genelistIntersect,]
# 
# logfcTable <- as.data.frame(cbind(limmaRes$logFC, resOrderedPreFilter$log2FoldChange))
# colnames(logfcTable) <- c("Steiling", "DECAMP")
# summary(lm(DECAMP ~ Steiling, logfcTable))

Post-filter

counts <- assay(seBronchQC)
rownames(counts) <- rowRanges(seBronch)$external_gene_id

counts <- apply(counts, 1:2, function(x) {
  return(as.integer(x))
})

annotBronchQC$GRPB_SMK_STS <- gsub(" ", "_", annotBronchQC$GRPB_SMK_STS)

no.smk.status.ix <- which(is.na(annotBronchQC$GRPB_SMK_STS))

counts <- counts[, -no.smk.status.ix]
annotBronchQC <- annotBronchQC[-no.smk.status.ix, ]

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = annotBronchQC,
  design = ~GRPB_SMK_STS
)
dds$GRPB_SMK_STS <- relevel(dds$GRPB_SMK_STS, ref = "Former_Smoker")
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]
dds <- DESeq(dds)
res <- results(dds)
# res

resOrdered <- res[order(res$pvalue), ]
# resOrdered
resOrdered <- as.data.frame(resOrdered)
resOrdered$nLogPVal <- -log10(resOrdered$padj)
resSig <- resOrdered %>%
  filter(padj < 0.05) %>%
  filter(abs(log2FoldChange) > 0.5)
numSigQC <- nrow(resSig)
numUpQC <- nrow(resSig[resSig$log2FoldChange > 0.5, ])
numDownQC <- nrow(resSig[resSig$log2FoldChange < -0.5, ])

labeltxt <- sapply(rownames(resOrdered), function(x) {
  if (x %in% c("CYP1A1")) {
    return("CYP1A1")
  } else if (x %in% c("CYP1B1")) {
    return("CYP1B1")
  } else {
    return("")
  }
})

resOrderedCYP <- resOrdered[c("CYP1A1", "CYP1B1"), ]

ggplot(resOrdered, aes(x = log2FoldChange, y = nLogPVal)) +
  geom_point(size = 0.75) +
  ggplot2::theme_bw() +
  ggplot2::theme(
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14)
  ) +
  ggplot2::theme(legend.title = ggplot2::element_text(size = 15), legend.text = ggplot2::element_text(size = 13)) +
  ggplot2::guides(
    colour =
      ggplot2::guide_legend(override.aes = list(size = 2))
  ) + geom_point(data = resSig, aes(x = log2FoldChange, y = nLogPVal), color = "Red",
                 size = 0.75) +
  geom_point(data = resOrderedCYP, aes(x = log2FoldChange, y = nLogPVal), color = "blue") +
  ggrepel::geom_text_repel(aes(x = log2FoldChange, y = nLogPVal), label = labeltxt) +
  ggplot2::labs(y = "-log10(Adjusted p-value)") +
  ggplot2::ggtitle(label = "Smoking status, Former vs Current") +
  ggplot2::theme(plot.title = ggplot2::element_text(
    hjust = 0.5,
    size = 18
  ))

A total of 2440(883 Upregulated, 1557 Downregulated) and 2595(852 Upregulated, 1743 Downregulated) were differentially expressed pre- and post- filter, respectively(padj < 0.05, abs(log2FC) > 0.5).

Reproducibility

For reproducibility:

## datetime

Sys.time()
## [1] "2020-11-05 13:19:09 EST"
## session info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS:   /share/pkg.7/r/4.0.2/install/lib/libopenblasp-r0.3.7.so
## LAPACK: /share/pkg.7/r/4.0.2/install/lib/liblapack.so.3.9.0
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.2.1            knitr_1.29                 
##  [3] DESeq2_1.28.1               RColorBrewer_1.1-2         
##  [5] pheatmap_1.0.12             ggplot2_3.3.2              
##  [7] dplyr_1.0.1                 readxl_1.3.1               
##  [9] SummarizedExperiment_1.18.1 DelayedArray_0.14.0        
## [11] matrixStats_0.57.0          Biobase_2.48.0             
## [13] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
## [15] IRanges_2.22.2              S4Vectors_0.26.1           
## [17] BiocGenerics_0.34.0         plyr_1.8.6                 
## [19] rmarkdown_2.3              
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2             bit64_0.9-7            viridisLite_0.3.0     
##  [4] splines_4.0.2          highr_0.8              blob_1.2.1            
##  [7] GenomeInfoDbData_1.2.3 cellranger_1.1.0       ggrepel_0.8.2         
## [10] yaml_2.2.1             pillar_1.4.6           RSQLite_2.2.0         
## [13] lattice_0.20-41        glue_1.4.1             digest_0.6.25         
## [16] XVector_0.28.0         rvest_0.3.6            colorspace_1.4-1      
## [19] htmltools_0.5.0        Matrix_1.2-18          XML_3.99-0.3          
## [22] pkgconfig_2.0.3        genefilter_1.70.0      zlibbioc_1.34.0       
## [25] purrr_0.3.4            xtable_1.8-4           scales_1.1.1          
## [28] webshot_0.5.2          BiocParallel_1.22.0    tibble_3.0.3          
## [31] annotate_1.66.0        farver_2.0.3           generics_0.0.2        
## [34] ellipsis_0.3.1         withr_2.2.0            survival_3.1-12       
## [37] magrittr_1.5           crayon_1.3.4           memoise_1.1.0         
## [40] evaluate_0.14          xml2_1.3.2             tools_4.0.2           
## [43] lifecycle_0.2.0        stringr_1.4.0          munsell_0.5.0         
## [46] locfit_1.5-9.4         AnnotationDbi_1.50.3   compiler_4.0.2        
## [49] rlang_0.4.7            grid_4.0.2             RCurl_1.98-1.2        
## [52] rstudioapi_0.11        bitops_1.0-6           labeling_0.3          
## [55] gtable_0.3.0           DBI_1.1.0              R6_2.4.1              
## [58] bit_1.1-15.2           stringi_1.4.6          Rcpp_1.0.5            
## [61] vctrs_0.3.2            geneplotter_1.66.0     tidyselect_1.1.0      
## [64] xfun_0.16